{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.007463,
     "end_time": "2021-02-03T20:30:06.571139",
     "exception": false,
     "start_time": "2021-02-03T20:30:06.563676",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Wifi features\n",
    "\n",
    "This is the code to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features can get a score below 14. For an example notebook using them see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). They only use waypoint, wifi and timestamp data to generate a solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method, and methods of improvement.\n",
    "\n",
    "There are `break` statements inserted into the loops, which need to be removed to get this to run. Right now data is written to the current working directory. This takes 2-4 hours to run depending on hard drive etc. There is a lot of room for improvement in speeding up feature generation. \n",
    "\n",
    "**Update:** I added one line that creates a column for the path filename, this allows for a groupkfold crossvalidation. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:06.590945Z",
     "iopub.status.busy": "2021-02-03T20:30:06.589984Z",
     "iopub.status.idle": "2021-02-03T20:30:06.593594Z",
     "shell.execute_reply": "2021-02-03T20:30:06.592887Z"
    },
    "papermill": {
     "duration": 0.01623,
     "end_time": "2021-02-03T20:30:06.593847",
     "exception": false,
     "start_time": "2021-02-03T20:30:06.577617",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import glob\n",
    "import os\n",
    "import gc\n",
    "import json "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:06.614521Z",
     "iopub.status.busy": "2021-02-03T20:30:06.613572Z",
     "iopub.status.idle": "2021-02-03T20:30:06.616669Z",
     "shell.execute_reply": "2021-02-03T20:30:06.616121Z"
    },
    "papermill": {
     "duration": 0.015585,
     "end_time": "2021-02-03T20:30:06.616837",
     "exception": false,
     "start_time": "2021-02-03T20:30:06.601252",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# root folder of the competition data; the train/ and test/ file paths used\n",
    "# further down are built from it\n",
    "base_path = \"../input/indoor-location-navigation/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:06.639011Z",
     "iopub.status.busy": "2021-02-03T20:30:06.638118Z",
     "iopub.status.idle": "2021-02-03T20:30:09.333807Z",
     "shell.execute_reply": "2021-02-03T20:30:09.334360Z"
    },
    "papermill": {
     "duration": 2.711076,
     "end_time": "2021-02-03T20:30:09.334617",
     "exception": false,
     "start_time": "2021-02-03T20:30:06.623541",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# pull out all the buildings actually used in the test set, given current method we don't need the other ones\n",
    "# (the path is built from base_path so the data location is configured in one place)\n",
    "ssubm = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))\n",
    "\n",
    "# only 24 of the total buildings are used in the test set, \n",
    "# this allows us to greatly reduce the initial size of the dataset\n",
    "\n",
    "# site_path_timestamp is split on `_` into three columns: 0 = site (building),\n",
    "# 1 = path (test file name), 2 = timestamp. The vectorised str.split replaces a\n",
    "# row-by-row apply that built one pd.Series per submission row.\n",
    "ssubm_df = ssubm[\"site_path_timestamp\"].str.split(\"_\", expand=True)\n",
    "used_buildings = sorted(ssubm_df[0].unique().tolist())\n",
    "\n",
    "# dictionary used to map the floor codes to the values used in the submission file. \n",
    "floor_map = {\"B2\":-2, \"B1\":-1, \"F1\":0, \"F2\": 1, \"F3\":2, \"F4\":3, \"F5\":4, \"F6\":5, \"F7\":6,\"F8\":7, \"F9\":8,\n",
    "             \"1F\":0, \"2F\":1, \"3F\":2, \"4F\":3, \"5F\":4, \"6F\":5, \"7F\":6, \"8F\": 7, \"9F\":8}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:09.359905Z",
     "iopub.status.busy": "2021-02-03T20:30:09.359123Z",
     "iopub.status.idle": "2021-02-03T20:30:09.362909Z",
     "shell.execute_reply": "2021-02-03T20:30:09.362224Z"
    },
    "papermill": {
     "duration": 0.021272,
     "end_time": "2021-02-03T20:30:09.363069",
     "exception": false,
     "start_time": "2021-02-03T20:30:09.341797",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Collect, per building, the wifi bssids that will be used when constructing features.\n",
    "# A bssid is kept when it occurs more than MIN_BSSID_COUNT times in the building's\n",
    "# training files (this number can be experimented with).\n",
    "# NOTE: the threshold is currently 0, i.e. every observed bssid is kept, even though\n",
    "# the file written afterwards is called bssid_1000.json; raise it to drop rare bssids.\n",
    "MIN_BSSID_COUNT = 0\n",
    "bssid = dict()\n",
    "\n",
    "for building in used_buildings:\n",
    "    # NOTE: this `break` deliberately disables the loop; remove it to run the extraction\n",
    "    break\n",
    "    folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))\n",
    "    print(building)\n",
    "    # only the bssid (field 3) of each wifi record is needed for counting, so the\n",
    "    # full records are not kept in memory\n",
    "    seen_bssid = list()\n",
    "    for folder in folders:\n",
    "        # the floor is irrelevant for counting, so floor_map is not consulted here\n",
    "        for file in glob.glob(os.path.join(folder, \"*.txt\")):\n",
    "            with open(file) as f:\n",
    "                for line in f:\n",
    "                    fields = line.strip().split()\n",
    "                    # length guard: blank or truncated lines have no record type / bssid\n",
    "                    if len(fields) > 3 and fields[1] == \"TYPE_WIFI\":\n",
    "                        seen_bssid.append(fields[3])\n",
    "    #top_bssid = value_counts.iloc[:500].index.tolist()\n",
    "    value_counts = pd.Series(seen_bssid, dtype=object).value_counts()\n",
    "    top_bssid = value_counts[value_counts > MIN_BSSID_COUNT].index.tolist()\n",
    "    print(len(top_bssid))\n",
    "    bssid[building] = top_bssid\n",
    "    del seen_bssid\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:09.383252Z",
     "iopub.status.busy": "2021-02-03T20:30:09.382581Z",
     "iopub.status.idle": "2021-02-03T20:30:09.386704Z",
     "shell.execute_reply": "2021-02-03T20:30:09.385809Z"
    },
    "papermill": {
     "duration": 0.016635,
     "end_time": "2021-02-03T20:30:09.386885",
     "exception": false,
     "start_time": "2021-02-03T20:30:09.370250",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# save the per-building bssid lists in the working directory ...\n",
    "with open(\"bssid_1000.json\", \"w\") as out_file:\n",
    "    json.dump(bssid, out_file)\n",
    "\n",
    "# ... and load them straight back, so the cells below work from the saved file's content\n",
    "with open(\"bssid_1000.json\") as in_file:\n",
    "    bssid = json.load(in_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:09.418284Z",
     "iopub.status.busy": "2021-02-03T20:30:09.417119Z",
     "iopub.status.idle": "2021-02-03T20:30:09.420513Z",
     "shell.execute_reply": "2021-02-03T20:30:09.419767Z"
    },
    "papermill": {
     "duration": 0.026514,
     "end_time": "2021-02-03T20:30:09.420694",
     "exception": false,
     "start_time": "2021-02-03T20:30:09.394180",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# generate all the training data \n",
    "#\n",
    "# Every wifi block (the wifi records of one file sharing a timestamp) becomes one row:\n",
    "# field 4 of each record (presumably the signal strength - confirm against the data\n",
    "# format) laid out over the building's sorted bssid list, -999 where a bssid is not in\n",
    "# the block. The row is labelled with x/y of the waypoint closest in time, the floor\n",
    "# and the path file name.\n",
    "building_dfs = dict()\n",
    "\n",
    "for building in used_buildings:\n",
    "    # NOTE: this `break` deliberately disables the loop; remove it to generate the data\n",
    "    break\n",
    "    folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))\n",
    "    dfs = list()\n",
    "    index = sorted(bssid[building])\n",
    "    print(building)\n",
    "    for folder in folders:\n",
    "        floor = floor_map[folder.split('/')[-1]]\n",
    "        files = glob.glob(os.path.join(folder, \"*.txt\"))\n",
    "        print(floor)\n",
    "        for file in files:\n",
    "            wifi = list()\n",
    "            waypoint = list()\n",
    "            with open(file) as f:\n",
    "                for line in f:\n",
    "                    fields = line.strip().split()\n",
    "                    if len(fields) < 2:\n",
    "                        continue  # blank or truncated line: no record type to inspect\n",
    "                    if fields[1] == \"TYPE_WAYPOINT\":\n",
    "                        waypoint.append(fields)\n",
    "                    elif fields[1] == \"TYPE_WIFI\":\n",
    "                        wifi.append(fields)\n",
    "\n",
    "            # without wifi records there is nothing to featurise and without waypoints\n",
    "            # there is no label to attach, so such a file contributes no rows\n",
    "            if not wifi or not waypoint:\n",
    "                continue\n",
    "\n",
    "            df = pd.DataFrame(wifi)\n",
    "            # constant for the whole file, so computed once instead of once per wifi block\n",
    "            waypoint_times = np.array([int(wp[0]) for wp in waypoint], dtype=np.int64)\n",
    "            path_name = file.split('/')[-1].split('.')[0]\n",
    "\n",
    "            # generate a feature, and label for each wifi block\n",
    "            for gid, g in df.groupby(0):\n",
    "                # waypoint whose timestamp is closest to the block's (first one on ties)\n",
    "                nearest_wp = waypoint[np.abs(waypoint_times - int(gid)).argmin()]\n",
    "\n",
    "                g = g.drop_duplicates(subset=3)\n",
    "                feat = g.iloc[:, 3:5].set_index(3).reindex(index).fillna(-999).T\n",
    "                feat[\"x\"] = float(nearest_wp[2])\n",
    "                feat[\"y\"] = float(nearest_wp[3])\n",
    "                feat[\"f\"] = floor\n",
    "                feat[\"path\"] = path_name # useful for crossvalidation\n",
    "                dfs.append(feat)\n",
    "\n",
    "    building_df = pd.concat(dfs)\n",
    "    # store the concatenated feature frame of the building; previously the raw wifi\n",
    "    # frame of the last file read (`df`) was stored here by mistake\n",
    "    building_dfs[building] = building_df\n",
    "    building_df.to_csv('../input/indoor-navigation-and-location-wifi-features/'+building+\"_train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-03T20:30:09.454304Z",
     "iopub.status.busy": "2021-02-03T20:30:09.451093Z",
     "iopub.status.idle": "2021-02-03T20:30:09.464308Z",
     "shell.execute_reply": "2021-02-03T20:30:09.464854Z"
    },
    "papermill": {
     "duration": 0.036471,
     "end_time": "2021-02-03T20:30:09.465079",
     "exception": false,
     "start_time": "2021-02-03T20:30:09.428608",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Generate the features for the test set\n",
    "#\n",
    "# For every submission row, take the wifi block whose timestamp is closest to the\n",
    "# requested timestamp and lay field 4 of its records out over the building's sorted\n",
    "# bssid list (-999 where a bssid is not in the block).\n",
    "\n",
    "ssubm_building_g = ssubm_df.groupby(0)\n",
    "feature_dict = dict()\n",
    "\n",
    "for gid0, g0 in ssubm_building_g:\n",
    "    # NOTE: this `break` deliberately disables the loop; remove it to generate the data\n",
    "    break\n",
    "    index = sorted(bssid[gid0])\n",
    "    feats = list()\n",
    "    print(gid0)\n",
    "    for path_id, g in g0.groupby(1):\n",
    "\n",
    "        # get all wifi records of this test path\n",
    "        with open(os.path.join(base_path, 'test', path_id + '.txt')) as f:\n",
    "            wifi = [fields for fields in (line.strip().split() for line in f)\n",
    "                    if len(fields) > 1 and fields[1] == \"TYPE_WIFI\"]\n",
    "\n",
    "        if wifi:\n",
    "            wifi_df = pd.DataFrame(wifi)\n",
    "            # distinct block timestamps in sorted order; converted to integers once\n",
    "            # per path instead of once per requested timestamp\n",
    "            scan_times = sorted(wifi_df[0].unique())\n",
    "            scan_times_int = np.array([int(t) for t in scan_times], dtype=np.int64)\n",
    "\n",
    "        for timepoint in g.iloc[:,2].tolist():\n",
    "\n",
    "            if wifi:\n",
    "                # wifi block closest in time to the requested timestamp (first one on ties)\n",
    "                nearest = scan_times[np.abs(scan_times_int - int(timepoint)).argmin()]\n",
    "                wifi_block = wifi_df[wifi_df[0] == nearest].drop_duplicates(subset=3)\n",
    "                feat = wifi_block.set_index(3)[4].reindex(index).fillna(-999)\n",
    "            else:\n",
    "                # a path without any wifi record still needs one row per requested\n",
    "                # timestamp, so emit the all-missing feature vector instead of crashing\n",
    "                feat = pd.Series(-999, index=index, dtype=object, name=4)\n",
    "\n",
    "            feat['site_path_timestamp'] = gid0 + \"_\" + path_id + \"_\" + timepoint\n",
    "            feats.append(feat)\n",
    "    feature_df = pd.concat(feats, axis=1).T\n",
    "    feature_df.to_csv('../input/indoor-navigation-and-location-wifi-features/'+gid0+\"_test.csv\")\n",
    "    feature_dict[gid0] = feature_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 9.894085,
   "end_time": "2021-02-03T20:30:10.083699",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-02-03T20:30:00.189614",
   "version": "2.2.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
